In [ ]:
%load_ext autoreload
%autoreload 2
import ast
import cPickle
import os
import sys

import numpy as np
In [17]:
# Character vocabulary: lowercase letters, digits, punctuation and finally the
# end-of-sequence marker. A character's position in this list is its index in
# the lookup tables below.
chars = (
    list('abcdefghijklmnopqrstuvwxyz')
    + list('0123456789')
    + list(" ,.:;'!?$%&()=+-")
    + ['<EOS>']
)

# Forward (character -> index) and reverse (index -> character) lookup tables.
chars_to_idx = dict((ch, pos) for pos, ch in enumerate(chars))
idx_to_chars = dict((pos, ch) for ch, pos in chars_to_idx.items())
In [1]:
def stringToOneHot(s, chars_to_idx, lower=True):
    """Encode a string as a sequence of one-hot row vectors.

    Parameters
    ----------
    s : str
        Text to encode.
    chars_to_idx : dict
        Maps each known character to its column index. Must contain the
        key '<EOS>'. The column count assumes the indices lie in
        0 .. len(chars_to_idx) - 1.
    lower : bool
        When True, `s` is lower-cased before encoding.

    Returns
    -------
    numpy.ndarray
        float16 array of shape (len(s) + 1, len(chars_to_idx) + 1). Row i
        encodes character i; the extra last column marks characters that are
        not in `chars_to_idx`; the extra last row is the '<EOS>' marker.

    Raises
    ------
    KeyError
        If `chars_to_idx` has no '<EOS>' entry.
    """
    text = s.lower() if lower else s

    # One extra column, placed after all known characters, flags unknowns.
    unknown_col = len(chars_to_idx)
    encoded = np.zeros((len(text) + 1, unknown_col + 1), dtype=np.float16)

    for row, ch in enumerate(text):
        # Characters outside the vocabulary fall back to the unknown column.
        encoded[row, chars_to_idx.get(ch, unknown_col)] = 1.0

    # The final row always carries the end-of-sequence marker.
    encoded[len(text), chars_to_idx['<EOS>']] = 1.0
    return encoded
def oneHotToString(seq, idx_to_chars):
    """Decode a sequence of one-hot vectors back into a string.

    Parameters
    ----------
    seq : iterable of array-like
        One vector per character; the position of each vector's maximum
        selects the character.
    idx_to_chars : dict
        Maps a column index to its character (or marker string).

    Returns
    -------
    str
        The decoded characters concatenated in order. Any index missing
        from `idx_to_chars` is rendered as '<UNK>'.
    """
    pieces = []
    for one_hot_vec in seq:
        hot_index = np.argmax(one_hot_vec)
        # Indices without a table entry decode to the unknown marker.
        pieces.append(idx_to_chars.get(hot_index, '<UNK>'))
    return "".join(pieces)
In [19]:
# Folder holding the corpus text files, relative to this notebook.
movieQA_folder = os.path.join('.', '..', 'Data', 'MovieQA')

# Load the text files into numpy arrays.
movie_convs_txt = os.path.join(movieQA_folder, 'movie_conversations.txt')
movie_lines_txt = os.path.join(movieQA_folder, 'movie_lines.txt')

# Both files share the same field separator; comments=None keeps any '#'
# inside the text from being treated as the start of a comment.
loadtxt_options = dict(dtype='string', delimiter=' +++$+++ ', comments=None)
movie_convs_np = np.loadtxt(movie_convs_txt, **loadtxt_options)
movie_lines_np = np.loadtxt(movie_lines_txt, **loadtxt_options)

print("Number of conversations : %d" % len(movie_convs_np))
print("Number of lines : %d" % len(movie_lines_np))
In [20]:
# Two lookup tables keyed by line ID:
#   line_to_one_hot   -> one-hot encoded text of the line
#   line_to_movie_car -> ID of the movie character, taken from the second field
line_to_one_hot = {}
line_to_movie_car = {}
for fields in movie_lines_np:
    # First field is the line ID, second the character ID, last the text.
    line_id, character_id, utterance = fields[0], fields[1], fields[-1]
    line_to_one_hot[line_id] = stringToOneHot(utterance, chars_to_idx, lower=True)
    line_to_movie_car[line_id] = character_id
In [21]:
# Sanity check: number of encoded lines, then one line decoded back to text.
print(len(line_to_one_hot))
sample_line = line_to_one_hot['L205']
print(oneHotToString(sample_line, idx_to_chars))
In [22]:
# Create a flat list of Q/A pairs.
# This is the simplest approach: it should be enough to train a mediocre
# character-level language model.
# Eventually, this dataset could be more useful for a dialogue model, since
# most conversations have more than 2 interactions.
qa_pairs = []
for conversation in movie_convs_np:
    # The last field is text read from the data file (presumably a list literal
    # of line IDs such as "['L1', 'L2']"). ast.literal_eval only accepts Python
    # literals, so unlike eval() it cannot execute code embedded in the file.
    lines = ast.literal_eval(conversation[-1])
    # Each pair of consecutive lines forms one (question, answer) example;
    # a conversation with fewer than two lines contributes nothing.
    for question_id, answer_id in zip(lines, lines[1:]):
        qa_pairs.append((line_to_one_hot[question_id], line_to_one_hot[answer_id]))
print("Got %d Q/A pairs." % len(qa_pairs))
In [23]:
# Sanity check: decode both sides of a single Q/A pair.
idx = 7
question_vecs, answer_vecs = qa_pairs[idx][0], qa_pairs[idx][1]
print(oneHotToString(question_vecs, idx_to_chars))
print(oneHotToString(answer_vecs, idx_to_chars))
In [24]:
# Persist the Q/A pairs next to the corpus files, stored under the
# "qa_data" key and written with the highest available pickle protocol.
qa_pairs_pkl = os.path.join(movieQA_folder, 'QA_Pairs.pkl')
payload = {"qa_data": qa_pairs}
with open(qa_pairs_pkl, 'wb') as pkl_out:
    cPickle.dump(payload, pkl_out, protocol=cPickle.HIGHEST_PROTOCOL)
In [25]:
# Sanity check: read the pickle back and rebind qa_pairs to the stored list.
with open(qa_pairs_pkl, 'rb') as pkl_in:
    data = cPickle.load(pkl_in)
qa_pairs = data["qa_data"]
In [26]:
# Show a few randomly chosen Q/A pairs, separated by a blank line.
n_examples = 5
for sample_no in range(n_examples):
    idx = np.random.randint(len(qa_pairs))
    question_vecs, answer_vecs = qa_pairs[idx][0], qa_pairs[idx][1]
    print(oneHotToString(question_vecs, idx_to_chars))
    print(oneHotToString(answer_vecs, idx_to_chars))
    print("")
In [ ]: